In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['patch.force_edgecolor'] = True
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
In [27]:
filepath = 'https://raw.githubusercontent.com/jubins/ML-TwitterBotDetection/master/FinalProjectAndCode/kaggle_data/'
file= filepath+'training_data_2_csv_UTF.csv'
training_data = pd.read_csv(file)
bots = training_data[training_data.bot==1]
nonbots = training_data[training_data.bot==0]
In [10]:
def get_heatmap(df):
#This function gives heatmap of all NaN values
plt.figure(figsize=(10,6))
sns.heatmap(df.isnull(), yticklabels=False, cbar=False, cmap='viridis')
plt.tight_layout()
return plt.show()
get_heatmap(training_data)
In [11]:
bots.friends_count/bots.followers_count
plt.figure(figsize=(10,5))
plt.subplot(2,1,1)
plt.title('Bots Friends vs Followers')
sns.regplot(bots.friends_count, bots.followers_count, color='red', label='Bots')
plt.xlim(0, 100)
plt.ylim(0, 100)
plt.tight_layout()
plt.subplot(2,1,2)
plt.title('NonBots Friends vs Followers')
sns.regplot(nonbots.friends_count, nonbots.followers_count, color='blue', label='NonBots')
plt.xlim(0, 100)
plt.ylim(0, 100)
plt.tight_layout()
plt.show()
In [12]:
bots['friends_by_followers'] = bots.friends_count/bots.followers_count
bots[bots.friends_by_followers<1].shape
nonbots['friends_by_followers'] = nonbots.friends_count/nonbots.followers_count
nonbots[nonbots.friends_by_followers<1].shape
Out[12]:
In [14]:
plt.figure(figsize=(10,5))
plt.plot(bots.listed_count, color='red', label='Bots')
plt.plot(nonbots.listed_count, color='blue', label='NonBots')
plt.legend(loc='upper left')
plt.ylim(10000,20000)
print(bots[(bots.listed_count<5)].shape)
In [19]:
bots_listed_count_df = bots[bots.listed_count<16000]
nonbots_listed_count_df = nonbots[nonbots.listed_count<16000]
bots_verified_df = bots_listed_count_df[bots_listed_count_df.verified==False]
bots_screenname_has_bot_df_ = bots_verified_df[(bots_verified_df.screen_name.str.contains("bot", case=False)==True)].shape
In [20]:
plt.figure(figsize=(12,7))
plt.subplot(2,1,1)
plt.plot(bots_listed_count_df.friends_count, color='red', label='Bots Friends')
plt.plot(nonbots_listed_count_df.friends_count, color='blue', label='NonBots Friends')
plt.legend(loc='upper left')
plt.subplot(2,1,2)
plt.plot(bots_listed_count_df.followers_count, color='red', label='Bots Followers')
plt.plot(nonbots_listed_count_df.followers_count, color='blue', label='NonBots Followers')
plt.legend(loc='upper left')
Out[20]:
In [21]:
#bots[bots.listedcount>10000]
condition = (bots.screen_name.str.contains("bot", case=False)==True)|(bots.description.str.contains("bot", case=False)==True)|(bots.location.isnull())|(bots.verified==False)
bots['screen_name_binary'] = (bots.screen_name.str.contains("bot", case=False)==True)
bots['location_binary'] = (bots.location.isnull())
bots['verified_binary'] = (bots.verified==False)
bots.shape
Out[21]:
In [22]:
condition = (nonbots.screen_name.str.contains("bot", case=False)==False)| (nonbots.description.str.contains("bot", case=False)==False) |(nonbots.location.isnull()==False)|(nonbots.verified==True)
nonbots['screen_name_binary'] = (nonbots.screen_name.str.contains("bot", case=False)==False)
nonbots['location_binary'] = (nonbots.location.isnull()==False)
nonbots['verified_binary'] = (nonbots.verified==True)
nonbots.shape
Out[22]:
In [28]:
df = pd.concat([bots, nonbots])
df.shape
Out[28]:
In [29]:
df.corr(method='spearman')
Out[29]:
In [32]:
plt.figure(figsize=(8,4))
sns.heatmap(df.corr(method='spearman'), cmap='coolwarm', annot=True)
plt.tight_layout()
plt.show()
Result:
In [33]:
#filepath = 'https://raw.githubusercontent.com/jubins/ML-TwitterBotDetection/master/FinalCode/kaggle_data/'
filepath = 'C:/Users/jubin/Documents/GitHub/ML-TwitterBotDetection/FinalProjectAndCode/kaggle_data/'
file= open(filepath+'training_data_2_csv_UTF.csv', mode='r', encoding='utf-8', errors='ignore')
training_data = pd.read_csv(file)
bag_of_words_bot = r'bot|b0t|cannabis|tweet me|mishear|follow me|updates every|gorilla|yes_ofc|forget' \
r'expos|kill|clit|bbb|butt|fuck|XXX|sex|truthe|fake|anony|free|virus|funky|RNA|kuck|jargon' \
r'nerd|swag|jack|bang|bonsai|chick|prison|paper|pokem|xx|freak|ffd|dunia|clone|genie|bbb' \
r'ffd|onlyman|emoji|joke|troll|droop|free|every|wow|cheese|yeah|bio|magic|wizard|face'
training_data['screen_name_binary'] = training_data.screen_name.str.contains(bag_of_words_bot, case=False, na=False)
training_data['name_binary'] = training_data.name.str.contains(bag_of_words_bot, case=False, na=False)
training_data['description_binary'] = training_data.description.str.contains(bag_of_words_bot, case=False, na=False)
training_data['status_binary'] = training_data.status.str.contains(bag_of_words_bot, case=False, na=False)
In [39]:
training_data['listed_count_binary'] = (training_data.listed_count>20000)==False
features = ['screen_name_binary', 'name_binary', 'description_binary', 'status_binary', 'verified', 'followers_count', 'friends_count', 'statuses_count', 'listed_count_binary', 'bot']
In [128]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_curve, auc
from sklearn.model_selection import train_test_split
X = training_data[features].iloc[:,:-1]
y = training_data[features].iloc[:,-1]
dt = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=50, min_samples_split=10)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
dt = dt.fit(X_train, y_train)
y_pred_train = dt.predict(X_train)
y_pred_test = dt.predict(X_test)
print("Trainig Accuracy: %.5f" %accuracy_score(y_train, y_pred_train))
print("Test Accuracy: %.5f" %accuracy_score(y_test, y_pred_test))
In [130]:
sns.set(font_scale=1.5)
sns.set_style("whitegrid", {'axes.grid' : False})
scores_train = dt.predict_proba(X_train)
scores_test = dt.predict_proba(X_test)
y_scores_train = []
y_scores_test = []
for i in range(len(scores_train)):
y_scores_train.append(scores_train[i][1])
for i in range(len(scores_test)):
y_scores_test.append(scores_test[i][1])
fpr_dt_train, tpr_dt_train, _ = roc_curve(y_train, y_scores_train, pos_label=1)
fpr_dt_test, tpr_dt_test, _ = roc_curve(y_test, y_scores_test, pos_label=1)
plt.plot(fpr_dt_train, tpr_dt_train, color='darkblue', label='Train AUC: %5f' %auc(fpr_dt_train, tpr_dt_train))
plt.plot(fpr_dt_test, tpr_dt_test, color='red', ls='--', label='Test AUC: %5f' %auc(fpr_dt_test, tpr_dt_test))
plt.title("Decision Tree ROC Curve")
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.legend(loc='lower right')
Out[130]:
Result: Decision Tree gives very good performance and generalizes well. But it may be overfitting as AUC is 0.937, so we will try other models.
In [131]:
from sklearn.naive_bayes import MultinomialNB
X = training_data[features].iloc[:,:-1]
y = training_data[features].iloc[:,-1]
mnb = MultinomialNB(alpha=0.0009)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
mnb = mnb.fit(X_train, y_train)
y_pred_train = mnb.predict(X_train)
y_pred_test = mnb.predict(X_test)
print("Trainig Accuracy: %.5f" %accuracy_score(y_train, y_pred_train))
print("Test Accuracy: %.5f" %accuracy_score(y_test, y_pred_test))
In [132]:
sns.set_style("whitegrid", {'axes.grid' : False})
scores_train = mnb.predict_proba(X_train)
scores_test = mnb.predict_proba(X_test)
y_scores_train = []
y_scores_test = []
for i in range(len(scores_train)):
y_scores_train.append(scores_train[i][1])
for i in range(len(scores_test)):
y_scores_test.append(scores_test[i][1])
fpr_mnb_train, tpr_mnb_train, _ = roc_curve(y_train, y_scores_train, pos_label=1)
fpr_mnb_test, tpr_mnb_test, _ = roc_curve(y_test, y_scores_test, pos_label=1)
plt.plot(fpr_mnb_train, tpr_mnb_train, color='darkblue', label='Train AUC: %5f' %auc(fpr_mnb_train, tpr_mnb_train))
plt.plot(fpr_mnb_test, tpr_mnb_test, color='red', ls='--', label='Test AUC: %5f' %auc(fpr_mnb_test, tpr_mnb_test))
plt.title("Multinomial NB ROC Curve")
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.legend(loc='lower right')
Out[132]:
Result: Clearly, Multinomial Niave Bayes peforms poorly and is not a good choice as the Train AUC is just 0.556 and Test is 0.555.
In [133]:
from sklearn.ensemble import RandomForestClassifier
X = training_data[features].iloc[:,:-1]
y = training_data[features].iloc[:,-1]
rf = RandomForestClassifier(criterion='entropy', min_samples_leaf=100, min_samples_split=20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
rf = rf.fit(X_train, y_train)
y_pred_train = rf.predict(X_train)
y_pred_test = rf.predict(X_test)
print("Trainig Accuracy: %.5f" %accuracy_score(y_train, y_pred_train))
print("Test Accuracy: %.5f" %accuracy_score(y_test, y_pred_test))
In [134]:
sns.set_style("whitegrid", {'axes.grid' : False})
scores_train = rf.predict_proba(X_train)
scores_test = rf.predict_proba(X_test)
y_scores_train = []
y_scores_test = []
for i in range(len(scores_train)):
y_scores_train.append(scores_train[i][1])
for i in range(len(scores_test)):
y_scores_test.append(scores_test[i][1])
fpr_rf_train, tpr_rf_train, _ = roc_curve(y_train, y_scores_train, pos_label=1)
fpr_rf_test, tpr_rf_test, _ = roc_curve(y_test, y_scores_test, pos_label=1)
plt.plot(fpr_rf_train, tpr_rf_train, color='darkblue', label='Train AUC: %5f' %auc(fpr_rf_train, tpr_rf_train))
plt.plot(fpr_rf_test, tpr_rf_test, color='red', ls='--', label='Test AUC: %5f' %auc(fpr_rf_test, tpr_rf_test))
plt.title("Random ForestROC Curve")
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.legend(loc='lower right')
Out[134]:
In [139]:
class twitter_bot(object):
def __init__(self):
pass
def perform_train_test_split(df):
msk = np.random.rand(len(df)) < 0.75
train, test = df[msk], df[~msk]
X_train, y_train = train, train.ix[:,-1]
X_test, y_test = test, test.ix[:, -1]
return (X_train, y_train, X_test, y_test)
def bot_prediction_algorithm(df):
# creating copy of dataframe
train_df = df.copy()
# performing feature engineering on id and verfied columns
# converting id to int
train_df['id'] = train_df.id.apply(lambda x: int(x))
#train_df['friends_count'] = train_df.friends_count.apply(lambda x: int(x))
train_df['followers_count'] = train_df.followers_count.apply(lambda x: 0 if x=='None' else int(x))
train_df['friends_count'] = train_df.friends_count.apply(lambda x: 0 if x=='None' else int(x))
#We created two bag of words because more bow is stringent on test data, so on all small dataset we check less
if train_df.shape[0]>600:
#bag_of_words_for_bot
bag_of_words_bot = r'bot|b0t|cannabis|tweet me|mishear|follow me|updates every|gorilla|yes_ofc|forget' \
r'expos|kill|clit|bbb|butt|fuck|XXX|sex|truthe|fake|anony|free|virus|funky|RNA|kuck|jargon' \
r'nerd|swag|jack|bang|bonsai|chick|prison|paper|pokem|xx|freak|ffd|dunia|clone|genie|bbb' \
r'ffd|onlyman|emoji|joke|troll|droop|free|every|wow|cheese|yeah|bio|magic|wizard|face'
else:
# bag_of_words_for_bot
bag_of_words_bot = r'bot|b0t|cannabis|mishear|updates every'
# converting verified into vectors
train_df['verified'] = train_df.verified.apply(lambda x: 1 if ((x == True) or x == 'TRUE') else 0)
# check if the name contains bot or screenname contains b0t
condition = ((train_df.name.str.contains(bag_of_words_bot, case=False, na=False)) |
(train_df.description.str.contains(bag_of_words_bot, case=False, na=False)) |
(train_df.screen_name.str.contains(bag_of_words_bot, case=False, na=False)) |
(train_df.status.str.contains(bag_of_words_bot, case=False, na=False))
) # these all are bots
predicted_df = train_df[condition] # these all are bots
predicted_df.bot = 1
predicted_df = predicted_df[['id', 'bot']]
# check if the user is verified
verified_df = train_df[~condition]
condition = (verified_df.verified == 1) # these all are nonbots
predicted_df1 = verified_df[condition][['id', 'bot']]
predicted_df1.bot = 0
predicted_df = pd.concat([predicted_df, predicted_df1])
# check if description contains buzzfeed
buzzfeed_df = verified_df[~condition]
condition = (buzzfeed_df.description.str.contains("buzzfeed", case=False, na=False)) # these all are nonbots
predicted_df1 = buzzfeed_df[buzzfeed_df.description.str.contains("buzzfeed", case=False, na=False)][['id', 'bot']]
predicted_df1.bot = 0
predicted_df = pd.concat([predicted_df, predicted_df1])
# check if listed_count>16000
listed_count_df = buzzfeed_df[~condition]
listed_count_df.listed_count = listed_count_df.listed_count.apply(lambda x: 0 if x == 'None' else x)
listed_count_df.listed_count = listed_count_df.listed_count.apply(lambda x: int(x))
condition = (listed_count_df.listed_count > 16000) # these all are nonbots
predicted_df1 = listed_count_df[condition][['id', 'bot']]
predicted_df1.bot = 0
predicted_df = pd.concat([predicted_df, predicted_df1])
#remaining
predicted_df1 = listed_count_df[~condition][['id', 'bot']]
predicted_df1.bot = 0 # these all are nonbots
predicted_df = pd.concat([predicted_df, predicted_df1])
return predicted_df
def get_predicted_and_true_values(features, target):
y_pred, y_true = twitter_bot.bot_prediction_algorithm(features).bot.tolist(), target.tolist()
return (y_pred, y_true)
def get_accuracy_score(df):
(X_train, y_train, X_test, y_test) = twitter_bot.perform_train_test_split(df)
# predictions on training data
y_pred_train, y_true_train = twitter_bot.get_predicted_and_true_values(X_train, y_train)
train_acc = metrics.accuracy_score(y_pred_train, y_true_train)
#predictions on test data
y_pred_test, y_true_test = twitter_bot.get_predicted_and_true_values(X_test, y_test)
test_acc = metrics.accuracy_score(y_pred_test, y_true_test)
return (train_acc, test_acc)
def plot_roc_curve(df):
(X_train, y_train, X_test, y_test) = twitter_bot.perform_train_test_split(df)
# Train ROC
y_pred_train, y_true = twitter_bot.get_predicted_and_true_values(X_train, y_train)
scores = np.linspace(start=0.01, stop=0.9, num=len(y_true))
fpr_train, tpr_train, threshold = metrics.roc_curve(y_pred_train, scores, pos_label=0)
plt.plot(fpr_train, tpr_train, label='Train AUC: %5f' % metrics.auc(fpr_train, tpr_train), color='darkblue')
#Test ROC
y_pred_test, y_true = twitter_bot.get_predicted_and_true_values(X_test, y_test)
scores = np.linspace(start=0.01, stop=0.9, num=len(y_true))
fpr_test, tpr_test, threshold = metrics.roc_curve(y_pred_test, scores, pos_label=0)
plt.plot(fpr_test,tpr_test, label='Test AUC: %5f' %metrics.auc(fpr_test,tpr_test), ls='--', color='red')
#Misc
plt.xlim([-0.1,1])
plt.title("Reciever Operating Characteristic (ROC)")
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.legend(loc='lower right')
plt.show()
if __name__ == '__main__':
start = time.time()
filepath = 'https://raw.githubusercontent.com/jubins/ML-TwitterBotDetection/master/FinalProjectAndCode/kaggle_data/'
train_df = pd.read_csv(filepath + 'training_data_2_csv_UTF.csv')
test_df = pd.read_csv(filepath + 'test_data_4_students.csv', sep='\t')
print("Train Accuracy: ", twitter_bot.get_accuracy_score(train_df)[0])
print("Test Accuracy: ", twitter_bot.get_accuracy_score(train_df)[1])
#predicting test data results
predicted_df = twitter_bot.bot_prediction_algorithm(test_df)
#plotting the ROC curve
twitter_bot.plot_roc_curve(train_df)
In [138]:
plt.figure(figsize=(14,10))
(X_train, y_train, X_test, y_test) = twitter_bot.perform_train_test_split(df)
#Train ROC
y_pred_train, y_true = twitter_bot.get_predicted_and_true_values(X_train, y_train)
scores = np.linspace(start=0, stop=1, num=len(y_true))
fpr_botc_train, tpr_botc_train, threshold = metrics.roc_curve(y_pred_train, scores, pos_label=0)
#Test ROC
y_pred_test, y_true = twitter_bot.get_predicted_and_true_values(X_test, y_test)
scores = np.linspace(start=0, stop=1, num=len(y_true))
fpr_botc_test, tpr_botc_test, threshold = metrics.roc_curve(y_pred_test, scores, pos_label=0)
#Train ROC
plt.subplot(2,2,1)
plt.plot(fpr_botc_train, tpr_botc_train, label='Our Classifier AUC: %5f' % metrics.auc(fpr_botc_train,tpr_botc_train), color='darkblue')
plt.plot(fpr_rf_train, tpr_rf_train, label='Random Forest AUC: %5f' %auc(fpr_rf_train, tpr_rf_train))
plt.plot(fpr_dt_train, tpr_dt_train, label='Decision Tree AUC: %5f' %auc(fpr_dt_train, tpr_dt_train))
plt.plot(fpr_mnb_train, tpr_mnb_train, label='MultinomialNB AUC: %5f' %auc(fpr_mnb_train, tpr_mnb_train))
plt.title("Training Set ROC Curve")
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.legend(loc='lower right')
#Test ROC
plt.subplot(2,2,2)
plt.plot(fpr_botc_test,tpr_botc_test, label='Our Classifier AUC: %5f' %metrics.auc(fpr_botc_test,tpr_botc_test), color='darkblue')
plt.plot(fpr_rf_test, tpr_rf_test, label='Random Forest AUC: %5f' %auc(fpr_rf_test, tpr_rf_test))
plt.plot(fpr_dt_test, tpr_dt_test, label='Decision Tree AUC: %5f' %auc(fpr_dt_test, tpr_dt_test))
plt.plot(fpr_mnb_test, tpr_mnb_test, label='MultinomialNB AUC: %5f' %auc(fpr_mnb_test, tpr_mnb_test))
plt.title("Test Set ROC Curve")
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.legend(loc='lower right')
plt.tight_layout()
In [ ]: